{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# PyTorch DDP Fashion MNIST Training Example\n",
    "\n",
    "This example demonstrates how to train a convolutional neural network (CNN) to classify images using the [Fashion MNIST](https://github.com/zalandoresearch/fashion-mnist) dataset and [PyTorch Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).\n",
    "\n",
    "This notebook walks you through running that example locally, and how to easily scale PyTorch DDP across multiple nodes with Kubeflow TrainJob."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Install the Kubeflow SDK\n",
    "\n",
    "You need to install the Kubeflow SDK to interact with Kubeflow Trainer APIs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO (astefanutti): Change to the Kubeflow SDK when it's available.\n",
    "!pip install git+https://github.com/kubeflow/trainer.git@master#subdirectory=sdk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Install the PyTorch Dependencies\n",
    "\n",
    "You also need to install PyTorch and Torchvision to be able to run the example locally:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch==2.5.0\n",
    "!pip install torchvision==0.20.0"
   ]
  },
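  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an optional sanity check, you can verify that the pinned versions were installed and whether a GPU is visible locally:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torchvision\n",
    "\n",
    "# Confirm the versions match the pins above and report GPU availability.\n",
    "print(f\"torch: {torch.__version__}, torchvision: {torchvision.__version__}\")\n",
    "print(f\"CUDA available: {torch.cuda.is_available()}\")"
   ]
  },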
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the Training Function\n",
    "\n",
    "The first step is to create function to train CNN model using Fashion MNIST data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_fashion_mnist():\n",
    "    import os\n",
    "\n",
    "    import torch\n",
    "    import torch.distributed as dist\n",
    "    import torch.nn.functional as F\n",
    "    from torch import nn\n",
    "    from torch.utils.data import DataLoader, DistributedSampler\n",
    "    from torchvision import datasets, transforms\n",
    "\n",
    "    # Define the PyTorch CNN model to be trained\n",
    "    class Net(nn.Module):\n",
    "        def __init__(self):\n",
    "            super(Net, self).__init__()\n",
    "            self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
    "            self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
    "            self.fc1 = nn.Linear(4 * 4 * 50, 500)\n",
    "            self.fc2 = nn.Linear(500, 10)\n",
    "\n",
    "        def forward(self, x):\n",
    "            x = F.relu(self.conv1(x))\n",
    "            x = F.max_pool2d(x, 2, 2)\n",
    "            x = F.relu(self.conv2(x))\n",
    "            x = F.max_pool2d(x, 2, 2)\n",
    "            x = x.view(-1, 4 * 4 * 50)\n",
    "            x = F.relu(self.fc1(x))\n",
    "            x = self.fc2(x)\n",
    "            return F.log_softmax(x, dim=1)\n",
    "\n",
    "    # Use NCCL if a GPU is available, otherwise use Gloo as communication backend.\n",
    "    device, backend = (\"cuda\", \"nccl\") if torch.cuda.is_available() else (\"cpu\", \"gloo\")\n",
    "    print(f\"Using Device: {device}, Backend: {backend}\")\n",
    "\n",
    "    # Setup PyTorch distributed.\n",
    "    local_rank = int(os.getenv(\"LOCAL_RANK\", 0))\n",
    "    dist.init_process_group(backend=backend)\n",
    "    print(\n",
    "        \"Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}\".format(\n",
    "            dist.get_world_size(),\n",
    "            dist.get_rank(),\n",
    "            local_rank,\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Create the model and load it into the device.\n",
    "    device = torch.device(f\"{device}:{local_rank}\")\n",
    "    model = nn.parallel.DistributedDataParallel(Net().to(device))\n",
    "    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)\n",
    "\n",
    "    # Retrieve the Fashion-MNIST dataset\n",
    "    dataset = datasets.FashionMNIST(\n",
    "        \"./data\",\n",
    "        train=True,\n",
    "        download=True,\n",
    "        transform=transforms.Compose([transforms.ToTensor()]),\n",
    "    )\n",
    "\n",
    "    # Shard the dataset accross workers.\n",
    "    train_loader = DataLoader(\n",
    "        dataset,\n",
    "        batch_size=100,\n",
    "        sampler=DistributedSampler(dataset)\n",
    "    )\n",
    "\n",
    "    # TODO(astefanutti): add parameters to the training function\n",
    "    for epoch in range(1, 3):\n",
    "        model.train()\n",
    "\n",
    "        # Iterate over mini-batches from the training set\n",
    "        for batch_idx, (inputs, labels) in enumerate(train_loader):\n",
    "            # Copy the data to the GPU device if available\n",
    "            inputs, labels = inputs.to(device), labels.to(device)\n",
    "            # Forward pass\n",
    "            outputs = model(inputs)\n",
    "            loss = F.nll_loss(outputs, labels)\n",
    "            # Backward pass\n",
    "            optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            if batch_idx % 10 == 0 and dist.get_rank() == 0:\n",
    "                print(\n",
    "                    \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}\".format(\n",
    "                        epoch,\n",
    "                        batch_idx * len(inputs),\n",
    "                        len(train_loader.dataset),\n",
    "                        100.0 * batch_idx / len(train_loader),\n",
    "                        loss.item(),\n",
    "                    )\n",
    "                )\n",
    "\n",
    "    # Wait for the distributed training to complete\n",
    "    dist.barrier()\n",
    "    if dist.get_rank() == 0:\n",
    "        print(\"Training is finished\")\n",
    "\n",
    "    # Finally clean up PyTorch distributed\n",
    "    dist.destroy_process_group()"
   ]
  },
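  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before running the function, it may help to see where the `4 * 4 * 50` flatten size in `Net` comes from. This optional sketch re-creates the two convolution layers (`Net` is local to `train_fashion_mnist`, so they are redefined here) and traces the shape of a single 28x28 image through them:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from torch import nn\n",
    "\n",
    "# Standalone copies of the two convolutions used by Net.\n",
    "conv1 = nn.Conv2d(1, 20, 5, 1)\n",
    "conv2 = nn.Conv2d(20, 50, 5, 1)\n",
    "\n",
    "x = torch.randn(1, 1, 28, 28)             # one 28x28 grayscale image\n",
    "x = F.max_pool2d(F.relu(conv1(x)), 2, 2)  # 28 -> 24 (conv) -> 12 (pool)\n",
    "x = F.max_pool2d(F.relu(conv2(x)), 2, 2)  # 12 -> 8 (conv) -> 4 (pool)\n",
    "print(x.shape)  # torch.Size([1, 50, 4, 4]), i.e. 4 * 4 * 50 = 800 features"
   ]
  },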
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dry-run the Training Locally\n",
    "\n",
    "We are going to download Fashion MNIST Dataset and start local training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using Device: cpu, Backend: gloo\n",
      "Distributed Training for WORLD_SIZE: 1, RANK: 0, LOCAL_RANK: 0\n",
      "Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.309967\n",
      "Train Epoch: 1 [1000/60000 (2%)]\tLoss: 2.045446\n",
      "Train Epoch: 1 [2000/60000 (3%)]\tLoss: 1.398883\n",
      "Train Epoch: 1 [3000/60000 (5%)]\tLoss: 0.992089\n",
      "Train Epoch: 1 [4000/60000 (7%)]\tLoss: 1.122684\n",
      "Train Epoch: 1 [5000/60000 (8%)]\tLoss: 1.031676\n",
      "Train Epoch: 1 [6000/60000 (10%)]\tLoss: 0.649529\n",
      "Train Epoch: 1 [7000/60000 (12%)]\tLoss: 0.804960\n",
      "Train Epoch: 1 [8000/60000 (13%)]\tLoss: 0.709698\n",
      "Train Epoch: 1 [9000/60000 (15%)]\tLoss: 0.632330\n",
      "Train Epoch: 1 [10000/60000 (17%)]\tLoss: 0.695469\n",
      "Train Epoch: 1 [11000/60000 (18%)]\tLoss: 0.646323\n",
      "Train Epoch: 1 [12000/60000 (20%)]\tLoss: 0.521877\n",
      "Train Epoch: 1 [13000/60000 (22%)]\tLoss: 0.592377\n",
      "Train Epoch: 1 [14000/60000 (23%)]\tLoss: 0.686853\n",
      "Train Epoch: 1 [15000/60000 (25%)]\tLoss: 0.678805\n",
      "Train Epoch: 1 [16000/60000 (27%)]\tLoss: 0.658783\n",
      "Train Epoch: 1 [17000/60000 (28%)]\tLoss: 0.540468\n",
      "Train Epoch: 1 [18000/60000 (30%)]\tLoss: 0.456685\n",
      "Train Epoch: 1 [19000/60000 (32%)]\tLoss: 0.561984\n",
      "Train Epoch: 1 [20000/60000 (33%)]\tLoss: 0.453478\n",
      "Train Epoch: 1 [21000/60000 (35%)]\tLoss: 0.399682\n",
      "Train Epoch: 1 [22000/60000 (37%)]\tLoss: 0.432961\n",
      "Train Epoch: 1 [23000/60000 (38%)]\tLoss: 0.611499\n",
      "Train Epoch: 1 [24000/60000 (40%)]\tLoss: 0.552892\n",
      "Train Epoch: 1 [25000/60000 (42%)]\tLoss: 0.409226\n",
      "Train Epoch: 1 [26000/60000 (43%)]\tLoss: 0.569662\n",
      "Train Epoch: 1 [27000/60000 (45%)]\tLoss: 0.379728\n",
      "Train Epoch: 1 [28000/60000 (47%)]\tLoss: 0.420447\n",
      "Train Epoch: 1 [29000/60000 (48%)]\tLoss: 0.410670\n",
      "Train Epoch: 1 [30000/60000 (50%)]\tLoss: 0.480141\n",
      "Train Epoch: 1 [31000/60000 (52%)]\tLoss: 0.425981\n",
      "Train Epoch: 1 [32000/60000 (53%)]\tLoss: 0.345157\n",
      "Train Epoch: 1 [33000/60000 (55%)]\tLoss: 0.323578\n",
      "Train Epoch: 1 [34000/60000 (57%)]\tLoss: 0.537613\n",
      "Train Epoch: 1 [35000/60000 (58%)]\tLoss: 0.523302\n",
      "Train Epoch: 1 [36000/60000 (60%)]\tLoss: 0.426407\n",
      "Train Epoch: 1 [37000/60000 (62%)]\tLoss: 0.356403\n",
      "Train Epoch: 1 [38000/60000 (63%)]\tLoss: 0.516297\n",
      "Train Epoch: 1 [39000/60000 (65%)]\tLoss: 0.406655\n",
      "Train Epoch: 1 [40000/60000 (67%)]\tLoss: 0.314193\n",
      "Train Epoch: 1 [41000/60000 (68%)]\tLoss: 0.467424\n",
      "Train Epoch: 1 [42000/60000 (70%)]\tLoss: 0.457645\n",
      "Train Epoch: 1 [43000/60000 (72%)]\tLoss: 0.388591\n",
      "Train Epoch: 1 [44000/60000 (73%)]\tLoss: 0.386649\n",
      "Train Epoch: 1 [45000/60000 (75%)]\tLoss: 0.282575\n",
      "Train Epoch: 1 [46000/60000 (77%)]\tLoss: 0.446804\n",
      "Train Epoch: 1 [47000/60000 (78%)]\tLoss: 0.418433\n",
      "Train Epoch: 1 [48000/60000 (80%)]\tLoss: 0.575584\n",
      "Train Epoch: 1 [49000/60000 (82%)]\tLoss: 0.382036\n",
      "Train Epoch: 1 [50000/60000 (83%)]\tLoss: 0.299168\n",
      "Train Epoch: 1 [51000/60000 (85%)]\tLoss: 0.423421\n",
      "Train Epoch: 1 [52000/60000 (87%)]\tLoss: 0.425236\n",
      "Train Epoch: 1 [53000/60000 (88%)]\tLoss: 0.403723\n",
      "Train Epoch: 1 [54000/60000 (90%)]\tLoss: 0.303039\n",
      "Train Epoch: 1 [55000/60000 (92%)]\tLoss: 0.375983\n",
      "Train Epoch: 1 [56000/60000 (93%)]\tLoss: 0.434169\n",
      "Train Epoch: 1 [57000/60000 (95%)]\tLoss: 0.429213\n",
      "Train Epoch: 1 [58000/60000 (97%)]\tLoss: 0.354376\n",
      "Train Epoch: 1 [59000/60000 (98%)]\tLoss: 0.305779\n",
      "Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.437120\n",
      "Train Epoch: 2 [1000/60000 (2%)]\tLoss: 0.464603\n",
      "Train Epoch: 2 [2000/60000 (3%)]\tLoss: 0.284665\n",
      "Train Epoch: 2 [3000/60000 (5%)]\tLoss: 0.369253\n",
      "Train Epoch: 2 [4000/60000 (7%)]\tLoss: 0.468896\n",
      "Train Epoch: 2 [5000/60000 (8%)]\tLoss: 0.388527\n",
      "Train Epoch: 2 [6000/60000 (10%)]\tLoss: 0.474483\n",
      "Train Epoch: 2 [7000/60000 (12%)]\tLoss: 0.373588\n",
      "Train Epoch: 2 [8000/60000 (13%)]\tLoss: 0.443588\n",
      "Train Epoch: 2 [9000/60000 (15%)]\tLoss: 0.449592\n",
      "Train Epoch: 2 [10000/60000 (17%)]\tLoss: 0.363776\n",
      "Train Epoch: 2 [11000/60000 (18%)]\tLoss: 0.400426\n",
      "Train Epoch: 2 [12000/60000 (20%)]\tLoss: 0.282801\n",
      "Train Epoch: 2 [13000/60000 (22%)]\tLoss: 0.288877\n",
      "Train Epoch: 2 [14000/60000 (23%)]\tLoss: 0.549093\n",
      "Train Epoch: 2 [15000/60000 (25%)]\tLoss: 0.359002\n",
      "Train Epoch: 2 [16000/60000 (27%)]\tLoss: 0.322263\n",
      "Train Epoch: 2 [17000/60000 (28%)]\tLoss: 0.289489\n",
      "Train Epoch: 2 [18000/60000 (30%)]\tLoss: 0.279724\n",
      "Train Epoch: 2 [19000/60000 (32%)]\tLoss: 0.452595\n",
      "Train Epoch: 2 [20000/60000 (33%)]\tLoss: 0.334388\n",
      "Train Epoch: 2 [21000/60000 (35%)]\tLoss: 0.340985\n",
      "Train Epoch: 2 [22000/60000 (37%)]\tLoss: 0.247467\n",
      "Train Epoch: 2 [23000/60000 (38%)]\tLoss: 0.439283\n",
      "Train Epoch: 2 [24000/60000 (40%)]\tLoss: 0.270795\n",
      "Train Epoch: 2 [25000/60000 (42%)]\tLoss: 0.283242\n",
      "Train Epoch: 2 [26000/60000 (43%)]\tLoss: 0.377896\n",
      "Train Epoch: 2 [27000/60000 (45%)]\tLoss: 0.264453\n",
      "Train Epoch: 2 [28000/60000 (47%)]\tLoss: 0.328696\n",
      "Train Epoch: 2 [29000/60000 (48%)]\tLoss: 0.294168\n",
      "Train Epoch: 2 [30000/60000 (50%)]\tLoss: 0.421162\n",
      "Train Epoch: 2 [31000/60000 (52%)]\tLoss: 0.306932\n",
      "Train Epoch: 2 [32000/60000 (53%)]\tLoss: 0.297351\n",
      "Train Epoch: 2 [33000/60000 (55%)]\tLoss: 0.261608\n",
      "Train Epoch: 2 [34000/60000 (57%)]\tLoss: 0.413534\n",
      "Train Epoch: 2 [35000/60000 (58%)]\tLoss: 0.433157\n",
      "Train Epoch: 2 [36000/60000 (60%)]\tLoss: 0.390571\n",
      "Train Epoch: 2 [37000/60000 (62%)]\tLoss: 0.242159\n",
      "Train Epoch: 2 [38000/60000 (63%)]\tLoss: 0.347628\n",
      "Train Epoch: 2 [39000/60000 (65%)]\tLoss: 0.321216\n",
      "Train Epoch: 2 [40000/60000 (67%)]\tLoss: 0.285891\n",
      "Train Epoch: 2 [41000/60000 (68%)]\tLoss: 0.401335\n",
      "Train Epoch: 2 [42000/60000 (70%)]\tLoss: 0.357113\n",
      "Train Epoch: 2 [43000/60000 (72%)]\tLoss: 0.321728\n",
      "Train Epoch: 2 [44000/60000 (73%)]\tLoss: 0.266073\n",
      "Train Epoch: 2 [45000/60000 (75%)]\tLoss: 0.235082\n",
      "Train Epoch: 2 [46000/60000 (77%)]\tLoss: 0.329955\n",
      "Train Epoch: 2 [47000/60000 (78%)]\tLoss: 0.351680\n",
      "Train Epoch: 2 [48000/60000 (80%)]\tLoss: 0.509699\n",
      "Train Epoch: 2 [49000/60000 (82%)]\tLoss: 0.281432\n",
      "Train Epoch: 2 [50000/60000 (83%)]\tLoss: 0.262006\n",
      "Train Epoch: 2 [51000/60000 (85%)]\tLoss: 0.432544\n",
      "Train Epoch: 2 [52000/60000 (87%)]\tLoss: 0.332725\n",
      "Train Epoch: 2 [53000/60000 (88%)]\tLoss: 0.313516\n",
      "Train Epoch: 2 [54000/60000 (90%)]\tLoss: 0.266921\n",
      "Train Epoch: 2 [55000/60000 (92%)]\tLoss: 0.279880\n",
      "Train Epoch: 2 [56000/60000 (93%)]\tLoss: 0.329515\n",
      "Train Epoch: 2 [57000/60000 (95%)]\tLoss: 0.379902\n",
      "Train Epoch: 2 [58000/60000 (97%)]\tLoss: 0.252111\n",
      "Train Epoch: 2 [59000/60000 (98%)]\tLoss: 0.267555\n",
      "Training is finished\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Set the Torch Distributed env variables so the training function can be run locally in the Notebook.\n",
    "# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables\n",
    "os.environ[\"RANK\"] = \"0\"\n",
    "os.environ[\"LOCAL_RANK\"] = \"0\"\n",
    "os.environ[\"WORLD_SIZE\"] = \"1\"\n",
    "os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
    "os.environ[\"MASTER_PORT\"] = \"1234\"\n",
    "\n",
    "# Run the training function locally.\n",
    "train_fashion_mnist()"
   ]
  },
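  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, clear the Torch Distributed environment variables after the local run so they do not leak into later cells:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the variables that were set for the local dry-run.\n",
    "for v in (\"RANK\", \"LOCAL_RANK\", \"WORLD_SIZE\", \"MASTER_ADDR\", \"MASTER_PORT\"):\n",
    "    os.environ.pop(v, None)"
   ]
  },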
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scale PyTorch DDP with Kubeflow TrainJob\n",
    "\n",
    "You can use `TrainerClient()` from the Kubeflow SDK to communicate with Kubeflow Trainer APIs and scale your training function across multiple PyTorch training nodes.\n",
    "\n",
    "`TrainerClient()` verifies that you have required access to the Kubernetes cluster.\n",
    "\n",
    "Kubeflow Trainer creates a `TrainJob` resource and automatically sets the appropriate environment variables to set up PyTorch in distributed environment.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from kubeflow.trainer import Trainer, TrainerClient\n",
    "\n",
    "client = TrainerClient()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List the Training Runtimes\n",
    "\n",
    "You can get the list of available Training Runtimes to start your TrainJob. Each Training Runtime shows whether you can use it for pre-training or post-training.\n",
    "\n",
    "Additionally, it migh show available accelerator type and number of available resources."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Runtime(name='torch-distributed', phase='pre-training', accelerator='gpu-tesla-v100-16gb', accelerator_count='4')\n"
     ]
    }
   ],
   "source": [
    "for runtime in client.list_runtimes():\n",
    "    print(runtime)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the Distributed TrainJob\n",
    "\n",
    "Kubeflow TrainJob will train the above model on 4 PyTorch nodes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "job_name = client.train(\n",
    "    # Use one the of the training runtimes installed on your Kubernetes cluster\n",
    "    runtime_ref=\"torch-distributed\",\n",
    "    trainer=Trainer(\n",
    "        func=train_fashion_mnist,\n",
    "        # Set how many PyTorch nodes you want to use for distributed training.\n",
    "        num_nodes=4,\n",
    "        # Set the resources for each PyTorch node.\n",
    "        resources_per_node={\n",
    "            \"cpu\": 5,\n",
    "            \"memory\": \"16Gi\",\n",
    "            # Comment this to distribute the TrainJob using CPU nodes.\n",
    "            \"nvidia.com/gpu\": 1,\n",
    "        },\n",
    "    ),\n",
    ")"
   ]
  },
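  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you want to block until the TrainJob completes, the following minimal polling sketch uses the `get_job()` API that is also used in the next section. Note that the terminal status strings (`Succeeded`, `Failed`) are assumptions and may differ in your Kubeflow Trainer installation:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "# Poll the component statuses until they all reach an (assumed) terminal state.\n",
    "while True:\n",
    "    statuses = {c.status for c in client.get_job(name=job_name).components}\n",
    "    print(f\"Component statuses: {statuses}\")\n",
    "    if statuses <= {\"Succeeded\", \"Failed\"}:\n",
    "        break\n",
    "    time.sleep(10)"
   ]
  },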
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check the TrainJob components\n",
    "\n",
    "You can check the components of TrainJob that's created.\n",
    "\n",
    "Since the TrainJob performs distributed training across 4 nodes, it generates 4 components: `trainer-node-0` .. `trainer-node-3`.\n",
    "\n",
    "You can get the individual status for each of these components."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Component: trainer-node-0, Status: Running, Devices: gpu x 1\n",
      "\n",
      "Component: trainer-node-1, Status: Running, Devices: gpu x 1\n",
      "\n",
      "Component: trainer-node-2, Status: Running, Devices: gpu x 1\n",
      "\n",
      "Component: trainer-node-3, Status: Running, Devices: gpu x 1\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for c in client.get_job(name=job_name).components:\n",
    "    print(f\"Component: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Watch the TrainJob logs\n",
    "\n",
    "We can use the `get_job_logs()` API to get the TrainJob logs.\n",
    "\n",
    "Since we run training on 4 GPUs, every PyTorch node uses 60,000/4 = 15,000 images from the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[trainer-node]: Using Device: cuda, Backend: nccl\n",
      "[trainer-node]: Distributed Training for WORLD_SIZE: 4, RANK: 0, LOCAL_RANK: 0\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
      "100%|██████████| 26.4M/26.4M [00:02<00:00, 12.5MB/s]\n",
      "[trainer-node]: Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
      "100%|██████████| 29.5k/29.5k [00:00<00:00, 214kB/s]\n",
      "[trainer-node]: Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
      "100%|██████████| 4.42M/4.42M [00:01<00:00, 3.50MB/s]\n",
      "[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
      "[trainer-node]: Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
      "100%|██████████| 5.15k/5.15k [00:00<00:00, 37.8MB/s]\n",
      "[trainer-node]: Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
      "[trainer-node]: Train Epoch: 1 [0/60000 (0%)]\tLoss: 2.305451\n",
      "[trainer-node]: Train Epoch: 1 [1000/60000 (7%)]\tLoss: 2.056247\n",
      "[trainer-node]: Train Epoch: 1 [2000/60000 (13%)]\tLoss: 2.166955\n",
      "[trainer-node]: Train Epoch: 1 [3000/60000 (20%)]\tLoss: 1.045183\n",
      "[trainer-node]: Train Epoch: 1 [4000/60000 (27%)]\tLoss: 0.767518\n",
      "[trainer-node]: Train Epoch: 1 [5000/60000 (33%)]\tLoss: 0.697382\n",
      "[trainer-node]: Train Epoch: 1 [6000/60000 (40%)]\tLoss: 0.638373\n",
      "[trainer-node]: Train Epoch: 1 [7000/60000 (47%)]\tLoss: 0.667810\n",
      "[trainer-node]: Train Epoch: 1 [8000/60000 (53%)]\tLoss: 0.541413\n",
      "[trainer-node]: Train Epoch: 1 [9000/60000 (60%)]\tLoss: 0.564223\n",
      "[trainer-node]: Train Epoch: 1 [10000/60000 (67%)]\tLoss: 0.425999\n",
      "[trainer-node]: Train Epoch: 1 [11000/60000 (73%)]\tLoss: 0.564535\n",
      "[trainer-node]: Train Epoch: 1 [12000/60000 (80%)]\tLoss: 0.459158\n",
      "[trainer-node]: Train Epoch: 1 [13000/60000 (87%)]\tLoss: 0.545110\n",
      "[trainer-node]: Train Epoch: 1 [14000/60000 (93%)]\tLoss: 0.471710\n",
      "[trainer-node]: Train Epoch: 2 [0/60000 (0%)]\tLoss: 0.520992\n",
      "[trainer-node]: Train Epoch: 2 [1000/60000 (7%)]\tLoss: 0.440295\n",
      "[trainer-node]: Train Epoch: 2 [2000/60000 (13%)]\tLoss: 0.436745\n",
      "[trainer-node]: Train Epoch: 2 [3000/60000 (20%)]\tLoss: 0.359110\n",
      "[trainer-node]: Train Epoch: 2 [4000/60000 (27%)]\tLoss: 0.493791\n",
      "[trainer-node]: Train Epoch: 2 [5000/60000 (33%)]\tLoss: 0.384616\n",
      "[trainer-node]: Train Epoch: 2 [6000/60000 (40%)]\tLoss: 0.529568\n",
      "[trainer-node]: Train Epoch: 2 [7000/60000 (47%)]\tLoss: 0.443400\n",
      "[trainer-node]: Train Epoch: 2 [8000/60000 (53%)]\tLoss: 0.352168\n",
      "[trainer-node]: Train Epoch: 2 [9000/60000 (60%)]\tLoss: 0.431930\n",
      "[trainer-node]: Train Epoch: 2 [10000/60000 (67%)]\tLoss: 0.282820\n",
      "[trainer-node]: Train Epoch: 2 [11000/60000 (73%)]\tLoss: 0.412141\n",
      "[trainer-node]: Train Epoch: 2 [12000/60000 (80%)]\tLoss: 0.367190\n",
      "[trainer-node]: Train Epoch: 2 [13000/60000 (87%)]\tLoss: 0.355502\n",
      "[trainer-node]: Train Epoch: 2 [14000/60000 (93%)]\tLoss: 0.326105\n",
      "[trainer-node]: Training is finished\n"
     ]
    }
   ],
   "source": [
    "_ = client.get_job_logs(job_name, follow=True)"
   ]
  },
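  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can reproduce that per-node shard size locally: `DistributedSampler` with 4 replicas assigns each rank a quarter of the 60,000 training images (the dataset was already downloaded by the dry-run above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.utils.data import DistributedSampler\n",
    "from torchvision import datasets, transforms\n",
    "\n",
    "dataset = datasets.FashionMNIST(\n",
    "    \"./data\",\n",
    "    train=True,\n",
    "    download=True,\n",
    "    transform=transforms.Compose([transforms.ToTensor()]),\n",
    ")\n",
    "\n",
    "# Shard size a single rank sees when the dataset is split across 4 nodes.\n",
    "sampler = DistributedSampler(dataset, num_replicas=4, rank=0)\n",
    "print(len(sampler))  # 15000 = 60000 / 4"
   ]
  },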
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Delete the TrainJob\n",
    "\n",
    "When TrainJob is finished, you can delete the resource.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "client.delete_job(job_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
